*** Table 9 Panel A ***

** This do.file builds our "reduced-form" markup measures, using 
** the grocery store data 
global db "D:\Dropbox\unequal_gains\main_data"
global db2 "D:\Dropbox\unequal_gains\QJE revision plan\analysis"
global Section4 "D:\Dropbox\unequal_gains\QJE revision plan\analysis\section4_data"
global resultspath "D:\Dropbox\unequal_gains\QJE revision plan\analysis\clean_results"
global datapath1 "/share/PI/chetty/are.berkeley.edu/Xavier/Safeway/US_weekly/" 
global datapathpath2 "D:\Dropbox\unequal_gains\stanford_data\Safeway\US_weekly"

/* 1. Build grocery store data from source files, 
* with price/wholesale cost/margin variables
cd "$datapath1"
set more off 

foreach i of numlist 2004(1)2007 {
use us_`i'.dta, clear
drop year week
drop if net_amt<0 | net_amt==0 | missing(net_amt)
drop if gross_amt<0 | gross_amt==0 | missing(gross_amt)
drop if avt_cost<0 | avt_cost==0 | missing(avt_cost)
drop if item_qty<0 | item_qty==0 | missing(item_qty)
* we drop the items with random weights because it's not clear how to deal with them (this is only 5.8% of items)
keep if meas_qty==0

gen cost=item_qty*avt_cost
collapse (sum) net_amt gross_amt cost item_qty (mean) category, by(store_id upc)

gen avg_markup=(net_amt-cost)/net_amt*100
gen avg_margin=(net_amt-cost)/cost*100

gen avg_unit_price=net_amt/item_qty
gen avg_unit_cost=cost/item_qty

gen avg_unit_gross_price=gross_amt/item_qty
gen avg_gross_markup=(gross_amt-cost)/net_amt*100
gen avg_gross_margin=(gross_amt-cost)/cost*100

save price_data_`i'.dta, replace
}
* Note that we've multiplied the margin by 100
*/

*** 2. Produce summary statistics on markup data and restrict sample (trimming out outliers)
cd "$datapath2"

* Include all years 2004 -2006
use price_data_2004.dta, clear
gen year=2004
append using price_data_2005.dta
replace year=2005 if missing(year)
append using price_data_2006.dta
replace year=2006 if missing(year)
* note: we don't include 2007 since there are seasonality effects and the 2007 data stops in June

sum avg_unit_price, d
sum avg_unit_price [aw=net_amt], d
gen p99_unit_price=r(p99)
gen p1_unit_price=r(p1)

sum avg_unit_cost, d
sum avg_unit_cost [aw=net_amt], d
gen p99_unit_cost=r(p99)
gen p1_unit_cost=r(p1)

drop avg_markup
gen avg_markup=avg_unit_price/avg_unit_cost
sum avg_markup [aw=net_amt], d
gen p99_markup=r(p99)
gen p1_markup=r(p1)

gen trimmed=0
foreach i in unit_price unit_cost markup {
replace trimmed=1 if avg_`i'>p99_`i' |  avg_`i'<p1_`i'
}
tab trimmed
* we are trimming 3.6% of the sample
save "$Section4/markup_data", replace

*** 3. Bring info on quality rank by module & consumer income 

** 3-i) Classify UPCs in markup dataset in modules by price deciles 

* HMS data
* 2004-2006 for HMS was done in 4b, but without preserving product module code' now we add other years and 
* create a full registry to match to the grocery store data 
foreach i of numlist 2004(1)2015 {

use "$db/Important Price Datasets/price_index_household_income_`i'_2004_modules.dta", clear

collapse (sum) total_spending total_quantity [aw=projection_factor], by(upc upc_ver_uc product_module_code ///
product_group_code department_code)
gen average_unit_price_all=total_spending/total_quantity

* bring in info on UPC price decile (within product module) in base period:
merge m:1 upc upc_ver_uc using "$db/Important Lists/upc_brand_size_final.dta"
keep if _merge==3
drop _merge
sum size1_amount, d
drop if size1_amount<r(p1) | size1_amount>r(p99)
gen average_unit_price_adj=average_unit_price_all/(size1_amount*multi)
* create quality ranks without weights
sort product_module_code average_unit_price_adj
bysort product_module_code: gen double temp=[_N]
bysort product_module_code: gen double temp2=[_n]
gen rank=temp2/temp*100
gen quality_rank=.
foreach r of numlist 1(1)10 {
replace quality_rank=`r' if rank<=`r'*10 & missing(quality_rank)
}

keep upc upc_ver_uc quality_rank product_module_code product_group_code department_code
duplicates drop
save "$db/Important Price Datasets/UPCqualitydeciles_`i'.dta", replace
}

* RMS data
cd "$db2\RMS\collapsed_files\state_level"

foreach i of numlist 2006(1)2015 {

use state_`i', clear

collapse (sum) total_spending total_quantity, by(upc product_module_code ///
product_group_code)
gen average_unit_price_all=total_spending/total_quantity
gen upc_ver_uc=1

* bring in info on UPC price decile (within product module) in base period:
merge m:1 upc upc_ver_uc using "$db/Important Lists/upc_brand_size_final.dta"
keep if _merge==3
drop _merge
sum size1_amount, d
drop if size1_amount<r(p1) | size1_amount>r(p99)
gen average_unit_price_adj=average_unit_price_all/(size1_amount*multi)
* create quality ranks without weights
sort product_module_code average_unit_price_adj
bysort product_module_code: gen double temp=[_N]
bysort product_module_code: gen double temp2=[_n]
gen rank=temp2/temp*100
gen quality_rank=.
foreach r of numlist 1(1)10 {
replace quality_rank=`r' if rank<=`r'*10 & missing(quality_rank)
}

keep upc upc_ver_uc quality_rank product_module_code product_group_code
duplicates drop
save "$db/Important Price Datasets/UPCqualitydeciles_`i'_RMS.dta", replace
}

*** 3-ii) Bring in info on consumer income in each module by price deciles
* note: this was computed in 4e ; note that here we only keep the data prior to 2007
* (given availability of grocery store data up to that year)
use "$Section4/consumerincome_QM", clear
drop if year>2006
collapse (sum) inc_totalspen_QM totalspen_QM, by(product_module_code)
gen double consumer_income_salesw_module=inc_totalspen_QM/totalspen_QM
save "$Section4/consumerincome_M_collapsed", replace

*** 4. Match datasets and finalize data

* A. Match
use "$db/Important Price Datasets/UPCqualitydeciles_2004.dta", clear
foreach i of numlist 2005(1)2015 {
append using "$db/Important Price Datasets/UPCqualitydeciles_`i'.dta"
}
foreach i of numlist 2006(1)2015 {
append using "$db/Important Price Datasets/UPCqualitydeciles_`i'_RMS.dta"
}
* keep just one classification for each upc (the most frequent)
keep upc product_module_code product_group_code department_code quality_rank
bysort upc quality_rank: gen dup=[_N]
gsort upc -dup
by upc: gen DUP=[_n] 
drop if DUP>1
drop DUP dup

merge 1:m upc using "$Section4/markup_data"
keep if _merge==3
* 94% of UPCs in grocery data find a match 
drop _merge

* merge to consumer income (first to quality module, then to consumer income for this quality module) 
merge m:1 product_module using "$Section4/consumerincome_M_collapsed"
keep if _merge==3 
drop _merge
* we lose another 3.5% of the data

* check we have departments for all observations (some are missing from RMS data)
merge m:1 product_group using  "$db/Important Lists/groups_dep.dta", keepusing(department) update
drop if _merge==2
drop _merge

* B. create variables we need 

* Compute log price change, log cost change, margin change and market share change
egen upc_store=group(upc store_id)
tsset upc_store year
by upc_store: gen count=[_N]
sum count, d
drop count
* doesn't seem too unbalanced

* note: we scale the log change by 100 so that it matches the scale of the margin change
gen l_avg_unit_cost=log(avg_unit_cost)
gen l_avg_unit_price=log(avg_unit_price)
gen l_avg_markup=log(avg_markup)

gen log_change_avg_unit_price=(l_avg_unit_price-L.l_avg_unit_price)*100
gen log_change_avg_unit_cost=(l_avg_unit_cost-L.l_avg_unit_cost)*100
gen log_change_avg_markup=(l_avg_markup-L.l_avg_markup)*100

gen avg_spending_weights=(net_amt+L.net_amt)/2

drop if missing(log_change_avg_unit_price) | missing(log_change_avg_unit_cost) ///
| missing(log_change_avg_markup) | missing(avg_spending_weights)

save "$Section4/markup_data_final", replace


